import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
sns.set(color_codes = True)
# To enable plotting graphs in Jupyter notebook
%matplotlib inline
import scipy.stats as stats
import statsmodels.api as statm
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
# Load the UCI Parkinson's voice-measurement dataset; 'status' is the target
# column (1 appears to mean diagnosed with Parkinson's — see analysis below).
pk_data = pd.read_csv("parkinsons.csv")
# Bare expressions: rendered only in a notebook (no effect when run as a script).
pk_data.head()
pk_data.tail()
# Column names and (rows, columns) shape of the raw data.
print(pk_data.columns)
print(pk_data.shape)
- We have 23 independent variables and 1 dependent variable, 'status', in the data set. We also have 195 rows, which can be split into train and test datasets.
pk_data.dtypes
- Almost all attributes are numeric; only 'name' is a string column.
# --- Missing-value audit and basic EDA --------------------------------------
pk_data.info()  # checking null values / non-null counts per column

# Per-column True/False counts of missingness.
# Fix: pd.value_counts was deprecated and removed in pandas 2.0 —
# use the Series method instead.
pk_data.isna().apply(pd.Series.value_counts)

# Drop rows with missing values only if any exist.
if pk_data.isnull().values.any():
    print("Missing values present : ", pk_data.isnull().values.sum())
    pk_data = pk_data.dropna()
else:
    print("No missing values present")

# Summary statistics, transposed so each row is one feature.
pk_data.describe().T

# Skewness of the features. 'name' is a string column, so restrict to
# numeric columns (required by pandas >= 2.0; previously implicit).
pk_data.skew(axis=0, skipna=True, numeric_only=True)

# Pairwise feature relationships, coloured by the target class.
sns.pairplot(pk_data, diag_kind='kde', hue="status")
plt.show()

# Correlation heatmap over the numeric columns only (corr() raises on the
# string 'name' column in pandas >= 2.0 without numeric_only=True).
corr = pk_data.corr(numeric_only=True)
plt.figure(figsize=(16, 10))
sns.heatmap(corr, annot=True, cmap="YlGnBu")
plt.show()

# Correlation of every feature with the target, strongest first.
correlation_values = pk_data.corr(numeric_only=True)['status']
correlation_values.abs().sort_values(ascending=False)
- Above are the correlation values in descending order. Based on them we will drop the columns from MDVP:RAP down to MDVP:Fhi(Hz), because they have the weakest correlation with the target.
- Dropping such irrelevant features can improve accuracy, since the model no longer has to fit noise from them.
# Class balance of the target (bare expression; displays only in a notebook).
pk_data['status'].value_counts()

# Bar chart of the two status classes.
status_palette = ['teal', 'yellow']
plt.title('Patient Status Distribution')
sns.countplot(x='status', data=pk_data, palette=status_palette)
plt.show()
We will use 70% of the data for training and 30% for testing.
We are going to drop the irrelevant columns from our dataset so that we can get better accuracy.
# Drop the weakly-correlated feature columns plus the 'name' identifier and
# the target itself, then split 70/30 into train and test sets.
cols = ['MDVP:RAP', 'Jitter:DDP', 'DFA', 'NHR', 'MDVP:Fhi(Hz)', 'name', 'status']
X = pk_data.drop(cols, axis=1)
Y = pk_data['status']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
x_train.head()

n_rows = len(pk_data.index)
print(f"{len(x_train) / n_rows * 100:0.2f}% data is in training set")
print(f"{len(x_test) / n_rows * 100:0.2f}% data is in test set")

# Class balance in the full data and in each split (1 = positive class).
n_pos = int((pk_data['status'] == 1).sum())
n_neg = int((pk_data['status'] == 0).sum())
print(f"Original status True Values : {n_pos} ({n_pos / n_rows * 100:0.2f}%)")
print(f"Original status False Values : {n_neg} ({n_neg / n_rows * 100:0.2f}%)")
print("")
train_pos = int((y_train == 1).sum())
train_neg = int((y_train == 0).sum())
print(f"Training status True Values : {train_pos} ({train_pos / len(y_train) * 100:0.2f}%)")
print(f"Training status False Values : {train_neg} ({train_neg / len(y_train) * 100:0.2f}%)")
print("")
test_pos = int((y_test == 1).sum())
test_neg = int((y_test == 0).sum())
print(f"Test status True Values : {test_pos} ({test_pos / len(y_test) * 100:0.2f}%)")
print(f"Test status False Values : {test_neg} ({test_neg / len(y_test) * 100:0.2f}%)")
print("")
from sklearn.linear_model import LogisticRegression

# --- Logistic Regression on the raw (unscaled) features ---------------------
# liblinear suits this small dataset and needs no feature scaling to converge.
LogReg = LogisticRegression(solver="liblinear")
LogReg.fit(x_train, y_train)

# predict on test
y_predict = LogReg.predict(x_test)

# Model coefficients (one per feature) plus the intercept.
coef_df = pd.DataFrame(LogReg.coef_)
coef_df['intercept'] = LogReg.intercept_
print(coef_df)

# Accuracy on the held-out test set.
LogReg_score = LogReg.score(x_test, y_test)
print("Accuracy : %f" % LogReg_score)

# Classification Report
print(classification_report(y_test, y_predict))

# Confusion Matrix (positive class listed first in both axes).
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', fmt='g')

# ROC curve from the predicted probabilities of the positive class.
logR_curve = LogReg.predict_proba(x_test)
fpr1, tpr1, threshold1 = roc_curve(y_test, logR_curve[:, 1])
roc_auc1 = auc(fpr1, tpr1)
# Fixed typo in the printed message: "curvre" -> "curve".
print("Area under the ROC curve : %f" % roc_auc1)
1) Here we got around 80% accuracy, but the two classes are imbalanced, so accuracy alone does not tell us how well the model performed.
2) The recall for class '1' is 95%, so out of all the subjects actually diagnosed with the disease, our model correctly identifies 95% of them.
3) The area under the ROC curve summarises how the model performs; here it is around 87%, which is good.
from sklearn import preprocessing

# --- Logistic Regression on standardized features ---------------------------
column_names = pk_data.columns  # get the column names
scaler = preprocessing.StandardScaler()
# Fit the scaler on the TRAINING data only, then apply the SAME transform to
# the test data. (Bug fix: the original called fit_transform on x_test, which
# re-fit the scaler on test-set statistics — a data leak and an inconsistent
# scaling between train and test.)
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.transform(x_test)

sc_LogReg = LogisticRegression(solver="liblinear")
sc_LogReg.fit(scaled_x_train, y_train)

# predict on test
sc_y_predict = sc_LogReg.predict(scaled_x_test)

coef_df = pd.DataFrame(sc_LogReg.coef_)
coef_df['intercept'] = sc_LogReg.intercept_
print(coef_df)

# Bug fix: report accuracy on the TEST set. The original scored the training
# set here while the classification report below used the test set, so the
# printed "Accuracy" did not match the rest of the evaluation.
sc_LogReg_score = sc_LogReg.score(scaled_x_test, y_test)
print("Accuracy : %f" % sc_LogReg_score)

# Classification Report
print(classification_report(y_test, sc_y_predict))

# Confusion Matrix
cm = metrics.confusion_matrix(y_test, sc_y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, cmap='YlGnBu', fmt='g')

# ROC / AUC for the scaled model.
sc_logR_curve = sc_LogReg.predict_proba(scaled_x_test)
sc_fpr1, sc_tpr1, sc_threshold1 = roc_curve(y_test, sc_logR_curve[:, 1])
# Bug fix: the original computed auc(fpr1, tpr1) — the UNSCALED model's
# curve — so the printed AUC belonged to the previous model, not this one.
sc_roc_auc1 = auc(sc_fpr1, sc_tpr1)
print("Area under the ROC curve : %f" % sc_roc_auc1)
1) Here we are with around 89% accuracy and the recall value is 97% which means our model did better in predicting True Positives with scaled data.
2) Also, the area under the curve is around 87%, which is same as previous value.
# loading library
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# Candidate neighbour counts: the odd numbers 1, 3, 5, ..., 19
# (odd k avoids ties in majority voting).
sc_neighbors = [k for k in range(1, 20) if k % 2 != 0]

# Test-set accuracy for each candidate k.
ac_scores = []
for k in sc_neighbors:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(scaled_x_train, y_train)
    ac_scores.append(accuracy_score(y_test, knn_model.predict(scaled_x_test)))

# Misclassification error = 1 - accuracy; the best k minimises it.
MSE = [1 - score for score in ac_scores]
sc_optimal_k = sc_neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % sc_optimal_k)

import matplotlib.pyplot as plt

# plot misclassification error vs k
plt.plot(sc_neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
# Final kNN model: use the k selected by the sweep above. (Bug fix: the
# original hard-coded n_neighbors=1, silently ignoring sc_optimal_k.)
sc_kNN = KNeighborsClassifier(n_neighbors=sc_optimal_k)
# fitting the model
sc_kNN.fit(scaled_x_train, y_train)
# predict the response on the test set (the unused training-set prediction
# sc_y_p was removed)
sc_y_pred = sc_kNN.predict(scaled_x_test)
# evaluate accuracy
print("Accuracy : %f" % accuracy_score(y_test, sc_y_pred))
# Classification Report
print(classification_report(y_test, sc_y_pred))
# Confusion Matrix
cm = metrics.confusion_matrix(y_test, sc_y_pred, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, cmap='YlGn', fmt='g')
# ROC / AUC from the positive-class probabilities.
sc_kNN_curve = sc_kNN.predict_proba(scaled_x_test)
sc_fpr2, sc_tpr2, sc_threshold2 = roc_curve(y_test, sc_kNN_curve[:, 1])
sc_roc_auc2 = auc(sc_fpr2, sc_tpr2)
# Fixed typo in the printed message: "curvre" -> "curve".
print("Area under the ROC curve : %f" % sc_roc_auc2)
1) Here we are with around 90% accuracy to diagnose the patient with Parkinson's. Also, the recall value is 95% which is good.
2) Also, the area under the curve is around 92%, which is better than the previous models.
from sklearn.naive_bayes import GaussianNB  # using Gaussian algorithm from Naive Bayes

# --- Gaussian Naive Bayes on the standardized features ----------------------
# create the model
NB = GaussianNB()
# y_train is already a 1-D pandas Series, so .ravel() is unnecessary
# (and Series.ravel is deprecated in pandas >= 2.2).
NB.fit(scaled_x_train, y_train)

# Accuracy on the training data (for comparison with test accuracy below).
NB_predict = NB.predict(scaled_x_train)
print("Model Accuracy with training data: {0:.4f}".format(metrics.accuracy_score(y_train, NB_predict)))
print()

# Accuracy on the held-out test data. (The redundant re-imports of
# sklearn.metrics were removed — it is already imported at the top of file.)
NB_test_predict = NB.predict(scaled_x_test)
print("Model Accuracy with testing data: {0:.4f}".format(metrics.accuracy_score(y_test, NB_test_predict)))
print()

# Classification Report
print(classification_report(y_test, NB_test_predict))

# Confusion Matrix
cm = metrics.confusion_matrix(y_test, NB_test_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, cmap='BuPu', fmt='g')

# ROC / AUC.
NB_curve = NB.predict_proba(scaled_x_test)
fpr3, tpr3, threshold3 = roc_curve(y_test, NB_curve[:, 1])
roc_auc3 = auc(fpr3, tpr3)
# Fixed typo in the printed message: "curvre" -> "curve".
print("Area under the ROC curve : %f" % roc_auc3)
1) We got an accuracy score of around 70% with a recall value of 68%, which is much lower than the kNN and Logistic Regression models.
2) Also, the area under the curve is around 78%, which is also not as good as Logistic Regression and kNN model.
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier

# Stack kNN, Random Forest and Naive Bayes, with Logistic Regression as the
# meta-learner that combines their predictions.
base_knn = KNeighborsClassifier(n_neighbors=5)
base_rf = RandomForestClassifier(random_state=1)
base_nb = GaussianNB()
meta_lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[base_knn, base_rf, base_nb],
                          meta_classifier=meta_lr)

print('3-fold cross validation:\n')

# Cross-validate each base learner and the stacked ensemble on the
# unscaled feature matrix.
labelled_models = [
    (base_knn, 'KNN'),
    (base_rf, 'Random Forest'),
    (base_nb, 'Naive Bayes'),
    (sclf, 'StackingClassifier'),
]
for clf, label in labelled_models:
    scores = model_selection.cross_val_score(clf, X, Y,
                                             cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
We will build our model using the DecisionTreeClassifier function. Using default 'gini' criteria to split.
from sklearn.tree import DecisionTreeClassifier

# --- Decision tree, unrestricted depth (tends to overfit) -------------------
dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree.fit(scaled_x_train, y_train)
print(dTree.score(scaled_x_train, y_train))  # train accuracy
print(dTree.score(scaled_x_test, y_test))    # test accuracy

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Bug fix: scaled_x_train is a numpy array (output of StandardScaler), so
# list(scaled_x_train) yields data ROWS, not column names. Take the real
# feature names from the pre-scaling DataFrame instead.
fn = list(x_train.columns)
cn = ['No', 'Yes']
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=200)
plot_tree(dTree, feature_names=fn, class_names=cn, filled=True)
fig.savefig('tree.png')

# --- Regularised tree: cap depth at 3 to reduce overfitting -----------------
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(scaled_x_train, y_train)
print(dTreeR.score(scaled_x_train, y_train))
print(dTreeR.score(scaled_x_test, y_test))

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=200)
# Bug fix: plot the regularised tree (the original re-plotted dTree here)
# and save it under its own name instead of overwriting tree.png.
plot_tree(dTreeR, feature_names=fn, class_names=cn, filled=True)
fig.savefig('treeR.png')
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Evaluate the depth-limited tree on the test set.
print(dTreeR.score(scaled_x_test, y_test))
y_predict = dTreeR.predict(scaled_x_test)

# Classification Report
print(classification_report(y_test, y_predict))

# Confusion matrix with the negative class first (labels=[0, 1] matches the
# ["No", "Yes"] axis labels).
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.ensemble import BaggingClassifier

# --- Bagging ensemble built on the (unrestricted) decision tree -------------
# Fix: the base_estimator keyword was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4; use the current name.
bgcl = BaggingClassifier(estimator=dTree, n_estimators=50, random_state=1)
bgcl = bgcl.fit(scaled_x_train, y_train)
y_predict = bgcl.predict(scaled_x_test)
print(bgcl.score(scaled_x_test, y_test))

# Classification Report
print(classification_report(y_test, y_predict))

# Confusion Matrix (negative class first, matching the axis labels).
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
1) Here we are with around 80% accuracy to diagnose the patient with Parkinson's. Also, the recall value is 97% which is good.
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 10 weak learners.
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
abcl = abcl.fit(scaled_x_train, y_train)
y_predict = abcl.predict(scaled_x_test)

# Test-set accuracy.
print(abcl.score(scaled_x_test, y_test))

# Classification Report
print(classification_report(y_test, y_predict))

# Confusion Matrix (negative class first, matching the axis labels).
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
1) Here we are with around 83% accuracy to diagnose the patient with Parkinson's. Also, the recall value is 95% which is good.
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees, at most 12 features considered per split.
rfcl = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12)
rfcl = rfcl.fit(scaled_x_train, y_train)
y_predict = rfcl.predict(scaled_x_test)

# Test-set accuracy.
print(rfcl.score(scaled_x_test, y_test))

# Classification Report
print(classification_report(y_test, y_predict))

# Confusion Matrix (negative class first, matching the axis labels).
cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=["No", "Yes"], columns=["No", "Yes"])
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
1) Here we are with around 81% accuracy to diagnose the patient with Parkinson's. Also, the recall value is 97% which is good.
plt.rcParams["figure.figsize"] = (20, 10)

# --- 10-fold cross-validated comparison of the candidate models -------------
random_seed = 7
outcome = []
model_names = []
models = [('Logistic Regression', LogisticRegression()),
          ('GaussianNB', GaussianNB()),
          ('Bagging', BaggingClassifier()),
          ('AdaBoost', AdaBoostClassifier()),
          ('Random Forest', RandomForestClassifier())]

# Bug fix: KFold with a random_state but shuffle=False raises a ValueError in
# scikit-learn >= 0.24 (the seed has no effect without shuffling), so enable
# shuffling. The splitter is loop-invariant, so build it once.
k_fold_validation = model_selection.KFold(n_splits=10, shuffle=True, random_state=random_seed)

for model_name, model in models:
    results = model_selection.cross_val_score(model, X, Y, cv=k_fold_validation, scoring='accuracy')
    outcome.append(results)
    model_names.append(model_name)
    output_message = "%s | Mean=%f | STD=%f" % (model_name, results.mean(), results.std())
    print(output_message)
From the above, it looks like the Logistic Regression, Bagging and Random Forest methods are providing the best results (based on the ‘mean’ values).
# Fit each candidate on the scaled training set and print its test-set
# accuracy and classification report.
for model_name, model in models:
    model.fit(scaled_x_train, y_train)
    predictions = model.predict(scaled_x_test)
    print(model_name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

# Box plot of the cross-validation accuracy spread per algorithm.
fig = plt.figure()
fig.suptitle('Machine Learning Model Comparison')
ax = fig.add_subplot(111)
ax.boxplot(outcome)
ax.set_xticklabels(model_names)
plt.show()
The plot above shows the spread of the accuracy scores across the cross-validation folds for each algorithm.